import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
# Make "plotly_white" the default template for every figure built below.
pio.templates.default = "plotly_white"

# Read the Netflix content CSV into a DataFrame and preview the first rows.
DATA_PATH = "C:/Users/hussien/OneDrive/Desktop/netflix_content_2023.csv"
netflix_data = pd.read_csv(DATA_PATH)
netflix_data.head()
| | Title | Available Globally? | Release Date | Hours Viewed | Language Indicator | Content Type |
|---|---|---|---|---|---|---|
| 0 | The Night Agent: Season 1 | Yes | 2023-03-23 | 81,21,00,000 | English | Show |
| 1 | Ginny & Georgia: Season 2 | Yes | 2023-01-05 | 66,51,00,000 | English | Show |
| 2 | The Glory: Season 1 // 더 글로리: 시즌 1 | Yes | 2022-12-30 | 62,28,00,000 | Korean | Show |
| 3 | Wednesday: Season 1 | Yes | 2022-11-23 | 50,77,00,000 | English | Show |
| 4 | Queen Charlotte: A Bridgerton Story | Yes | 2023-05-04 | 50,30,00,000 | English | Movie |
# Print a summary of the frame: column names, non-null counts and dtypes.
netflix_data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24812 entries, 0 to 24811
Data columns (total 6 columns):
 #   Column               Non-Null Count  Dtype
---  ------               --------------  -----
 0   Title                24812 non-null  object
 1   Available Globally?  24812 non-null  object
 2   Release Date         8166 non-null   object
 3   Hours Viewed         24812 non-null  object
 4   Language Indicator   24812 non-null  object
 5   Content Type         24812 non-null  object
dtypes: object(6)
memory usage: 1.1+ MB
def clean_strings(series):
    """
    Convert comma-grouped numeric strings to floats.

    Every comma is removed, so any digit grouping is accepted (for example
    '1,000,000' as well as '81,21,00,000'), and the result is cast to float.
    A Series that is already numeric is only cast to float, which makes the
    function safe to call again on a column that has been cleaned before.

    Args:
        series (pd.Series): Strings such as '81,21,00,000', or numbers.

    Returns:
        pd.Series: The same values as floats.

    Raises:
        ValueError: If a value is not a number once the commas are removed.
    """
    # Already-numeric input has no `.str` accessor; just normalise the dtype.
    if pd.api.types.is_numeric_dtype(series):
        return series.astype(float)
    # The comma is a literal separator, not a pattern, so skip the regex engine.
    return series.str.replace(',', '', regex=False).astype(float)
# Replace the comma-grouped 'Hours Viewed' strings with their float values.
raw_hours_viewed = netflix_data['Hours Viewed']
netflix_data['Hours Viewed'] = clean_strings(raw_hours_viewed)
# Aggregate one column per group and return the result as a DataFrame.
def aggregate_viewership_hours(dataframe, group_column, agg_column, aggregate_function):
    """
    Aggregate one column of a DataFrame within each group of another column.

    Args:
        dataframe (pd.DataFrame): The input DataFrame.
        group_column (str): Column whose values define the groups.
        agg_column (str): Column to aggregate within each group.
        aggregate_function (str): Aggregation to apply; one of
            'count', 'sum', 'mean', 'median'.

    Returns:
        pd.DataFrame: One row per group, with `group_column` and the
        aggregated `agg_column` as regular columns (the index is reset).

    Raises:
        ValueError: If `aggregate_function` is not one of the supported names.
    """
    supported_functions = ('count', 'sum', 'mean', 'median')
    # Validate first so an unsupported name fails before any grouping work.
    if aggregate_function not in supported_functions:
        raise ValueError("Invalid aggregation function.")
    grouped_values = dataframe.groupby(group_column)[agg_column]
    return grouped_values.agg(aggregate_function).reset_index()
# Sum the hours viewed for each content type.
content_type_viewership = aggregate_viewership_hours(
    dataframe=netflix_data,
    group_column='Content Type',
    agg_column='Hours Viewed',
    aggregate_function='sum',
)

# Pie chart: one slice per content type, sized by its summed hours.
fig = px.pie(
    data_frame=content_type_viewership,
    names='Content Type',
    values='Hours Viewed',
    title='Total Viewership Hours by Content Type',
)
fig.show()
# Aggregate viewership hours by language, largest total first.
language_viewership = aggregate_viewership_hours(
    netflix_data, 'Language Indicator', 'Hours Viewed', 'sum'
)
language_viewership = language_viewership.sort_values(
    by='Hours Viewed', ascending=False
)

# aggregate_viewership_hours returns a DataFrame whose index has been reset,
# so the bars are built from its two columns. Using `.index` would put row
# numbers on the x-axis and `.values` would hand Plotly a 2-D array.
fig = go.Figure(data=[
    go.Bar(
        x=language_viewership['Language Indicator'],
        y=language_viewership['Hours Viewed'],
        marker_color='lightcoral',
    )
])
fig.update_layout(
    title='Total Viewership Hours by Language (2023)',
    xaxis_title='Language',
    yaxis_title='Total Hours Viewed (in billions)',
    xaxis_tickangle=45,
    height=600,
    width=1000,
)
fig.show()
# Parse 'Release Date' into datetimes and store the month number alongside it.
parsed_release_dates = pd.to_datetime(netflix_data['Release Date'])
netflix_data['Release Date'] = parsed_release_dates
netflix_data['Release Month'] = parsed_release_dates.dt.month

# Sum the hours viewed for each release month.
monthly_viewership = netflix_data.groupby('Release Month')['Hours Viewed'].sum()

# Line chart of the monthly totals, with month abbreviations as tick labels.
month_labels = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
                'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
fig = go.Figure()
fig.add_trace(
    go.Scatter(
        x=monthly_viewership.index,
        y=monthly_viewership.values,
        mode='lines+markers',
        marker={'color': 'blue'},
        line={'color': 'blue'},
    )
)
fig.update_layout(
    title='Total Viewership Hours by Release Month (2023)',
    xaxis_title='Month',
    yaxis_title='Total Hours Viewed (in billions)',
    xaxis={
        'tickmode': 'array',
        'tickvals': list(range(1, 13)),
        'ticktext': month_labels,
    },
    height=600,
    width=1000,
)
fig.show()
# The five rows with the highest 'Hours Viewed', shown with a subset of columns.
top_5_titles = netflix_data.nlargest(n=5, columns='Hours Viewed')
summary_columns = ['Title', 'Hours Viewed', 'Language Indicator', 'Content Type', 'Release Date']
top_5_titles[summary_columns]
| | Title | Hours Viewed | Language Indicator | Content Type | Release Date |
|---|---|---|---|---|---|
| 0 | The Night Agent: Season 1 | 812100000.0 | English | Show | 2023-03-23 |
| 1 | Ginny & Georgia: Season 2 | 665100000.0 | English | Show | 2023-01-05 |
| 18227 | King the Land: Limited Series // 킹더랜드: 리미티드 시리즈 | 630200000.0 | Korean | Movie | 2023-06-17 |
| 2 | The Glory: Season 1 // 더 글로리: 시즌 1 | 622800000.0 | Korean | Show | 2022-12-30 |
| 18214 | ONE PIECE: Season 1 | 541900000.0 | English | Show | 2023-08-31 |
# Pivot table of summed hours viewed: one row per release month,
# one column per content type.
monthly_viewership_by_type = netflix_data.pivot_table(
    values='Hours Viewed',
    index='Release Month',
    columns='Content Type',
    aggfunc='sum',
)
monthly_viewership_by_type
| Content Type | Movie | Show |
|---|---|---|
| Release Month | ||
| 1.0 | 2.275900e+09 | 4.995700e+09 |
| 2.0 | 1.654400e+09 | 5.449300e+09 |
| 3.0 | 2.109400e+09 | 5.327700e+09 |
| 4.0 | 2.757600e+09 | 4.108100e+09 |
| 5.0 | 2.520500e+09 | 4.574100e+09 |
| 6.0 | 3.135800e+09 | 5.386200e+09 |
| 7.0 | 1.615700e+09 | 4.909100e+09 |
| 8.0 | 2.186400e+09 | 4.631400e+09 |
| 9.0 | 2.092300e+09 | 5.169900e+09 |
| 10.0 | 3.400400e+09 | 4.722800e+09 |
| 11.0 | 1.866900e+09 | 5.882600e+09 |
| 12.0 | 2.554900e+09 | 7.500900e+09 |
# Visualize the pivot table: one line per content type across release months.
fig = go.Figure()
for content_type, hours_by_month in monthly_viewership_by_type.items():
    type_trace = go.Scatter(
        x=monthly_viewership_by_type.index,
        y=hours_by_month,
        mode='lines+markers',
        name=content_type,
    )
    fig.add_trace(type_trace)

fig.update_layout(
    title='Viewership Trends by Content Type and Release Month (2023)',
    xaxis_title='Month',
    yaxis_title='Total Hours Viewed (in billions)',
    xaxis={
        'tickmode': 'array',
        'tickvals': list(range(1, 13)),
        'ticktext': ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
                     'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'],
    },
    height=600,
    width=1000,
    legend_title='Content Type',
)
fig.show()
# Define seasons based on release months.
def get_season(month):
    """
    Map a month number to its season name.

    Args:
        month (int | float): Month number, 1-12. Floats such as 3.0 are
            accepted because they compare equal to the integer month.

    Returns:
        str | None: 'Winter' (12, 1, 2), 'Spring' (3-5), 'Summer' (6-8) or
        'Fall' (9-11). None for a missing (NaN) or out-of-range month, so
        rows without a release month are not counted as 'Fall'.
    """
    if month in (12, 1, 2):
        return 'Winter'
    if month in (3, 4, 5):
        return 'Spring'
    if month in (6, 7, 8):
        return 'Summer'
    if month in (9, 10, 11):
        return 'Fall'
    # NaN never compares equal to anything, so a missing month ends up here.
    return None
# Label every row with the season of its release month.
release_months = netflix_data['Release Month']
netflix_data['Release Season'] = release_months.apply(get_season)

# Sum the hours viewed per season, then put the seasons in calendar order.
seasons_order = ['Winter', 'Spring', 'Summer', 'Fall']
seasonal_viewership = (
    netflix_data
    .groupby('Release Season')['Hours Viewed']
    .sum()
    .reindex(seasons_order)
)

# Bar chart of the seasonal totals; the x-axis follows seasons_order.
season_bars = go.Bar(
    x=seasonal_viewership.index,
    y=seasonal_viewership.values,
    marker_color='orange',
)
fig = go.Figure(data=[season_bars])
fig.update_layout(
    title='Total Viewership Hours by Release Season (2023)',
    xaxis_title='Season',
    yaxis_title='Total Hours Viewed (in billions)',
    xaxis_tickangle=0,
    height=500,
    width=800,
    xaxis={
        'categoryorder': 'array',
        'categoryarray': seasons_order,
    },
)
fig.show()
# Number of titles per release month (sorted by month number) and the
# summed hours viewed per release month.
monthly_releases = netflix_data['Release Month'].value_counts().sort_index()
monthly_viewership = netflix_data.groupby('Release Month')['Hours Viewed'].sum()

# Bars for release counts on the left axis.
releases_bar = go.Bar(
    x=monthly_releases.index,
    y=monthly_releases.values,
    name='Number of Releases',
    marker_color='goldenrod',
    opacity=0.7,
    yaxis='y1',
)
# Line for viewership hours on the secondary (right) axis.
viewership_line = go.Scatter(
    x=monthly_viewership.index,
    y=monthly_viewership.values,
    name='Viewership Hours',
    mode='lines+markers',
    marker={'color': 'red'},
    line={'color': 'red'},
    yaxis='y2',
)

fig = go.Figure()
fig.add_trace(releases_bar)
fig.add_trace(viewership_line)
fig.update_layout(
    title='Monthly Release Patterns and Viewership Hours (2023)',
    xaxis={
        'title': 'Month',
        'tickmode': 'array',
        'tickvals': list(range(1, 13)),
        'ticktext': ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
                     'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'],
    },
    yaxis={
        'title': 'Number of Releases',
        'showgrid': False,
        'side': 'left',
    },
    # The second y-axis is drawn over the first one, on the right-hand side.
    yaxis2={
        'title': 'Total Hours Viewed (in billions)',
        'overlaying': 'y',
        'side': 'right',
        'showgrid': False,
    },
    # Legend placed just outside the plot area, to the right.
    legend={
        'x': 1.05,
        'y': 1,
        'orientation': 'v',
        'xanchor': 'left',
    },
    height=600,
    width=1000,
)
fig.show()
# Significant holidays and events in 2023, as datetimes.
important_dates = pd.to_datetime([
    '2023-01-01',  # New Year's Day
    '2023-02-14',  # Valentine's Day
    '2023-07-04',  # Independence Day (US)
    '2023-10-31',  # Halloween
    '2023-12-25',  # Christmas Day
])

# Flag releases whose date is within 3 days (before or after) of any holiday.
# The comparison is done column-wise, once per holiday, instead of calling a
# Python lambda for every row. Rows without a release date get a NaN offset,
# which `between` treats as out of range, so they are never selected.
release_dates = netflix_data['Release Date']
near_holiday = pd.Series(False, index=netflix_data.index)
for holiday in important_dates:
    day_offset = (release_dates - holiday).dt.days
    near_holiday |= day_offset.between(-3, 3)
holiday_releases = netflix_data[near_holiday]

# Aggregate viewership hours for releases near significant holidays.
holiday_viewership = holiday_releases.groupby('Release Date')['Hours Viewed'].sum()
holiday_releases[['Title', 'Release Date', 'Hours Viewed']]
| | Title | Release Date | Hours Viewed |
|---|---|---|---|
| 2 | The Glory: Season 1 // 더 글로리: 시즌 1 | 2022-12-30 | 622800000.0 |
| 6 | La Reina del Sur: Season 3 | 2022-12-30 | 429600000.0 |
| 11 | Kaleidoscope: Limited Series | 2023-01-01 | 252500000.0 |
| 29 | Perfect Match: Season 1 | 2023-02-14 | 176800000.0 |
| 124 | Lady Voyeur: Limited Series // Olhar Indiscret... | 2022-12-31 | 86000000.0 |
| ... | ... | ... | ... |
| 22324 | The Romantics: Limited Series | 2023-02-14 | 1000000.0 |
| 22327 | Aggretsuko: Season 5 // アグレッシブ烈子: シーズン5 | 2023-02-16 | 900000.0 |
| 22966 | The Lying Life of Adults: Limited Series // La... | 2023-01-04 | 900000.0 |
| 22985 | Community Squad: Season 1 // División Palermo:... | 2023-02-17 | 800000.0 |
| 24187 | Live to Lead: Limited Series | 2022-12-31 | 400000.0 |
98 rows × 3 columns